In [8]:
#imports
import nltk
from utils.data.readCorpus import NltkCorpusFromDir
import pandas as pd
import numpy as np
import plotly.express as px
from scipy.stats import variation
In [3]:
# prepare the corpus
# Build the corpus reader over every ".txt" file found under the corpus root.
corpus_root = "/home/krzys/Kod/streamlit/voces/data/corpora/latinise_IT_lemmas"
txt_file_pattern = r".*\.txt"
latinise = NltkCorpusFromDir(root=corpus_root, fileids=txt_file_pattern)
In [4]:
#prepare corpus metadata
# List the corpus files once; the set gives O(1) membership tests when
# counting tokens below (the original re-listed the corpus for every row).
corpus_fileids = latinise.fileids()
known_fileids = set(corpus_fileids)

# Pair each corpus file with the document id taken from the third
# "_"-separated field of its filename.
filenames = pd.DataFrame(
    [(fname, fname.split('_')[2]) for fname in corpus_fileids],
    columns=["filename", "id"],
)

# Join the external metadata table to the corpus files on "id", keep a single
# row per id, and index the result by corpus filename.
metadata = (
    pd.read_csv("/media/HOME_FOLDERS/krzys/Kod/lvlt22/BMG/latinise_metadata.csv", index_col="id")
    .merge(filenames, on="id")
    .drop_duplicates('id')
    .set_index('filename')
)

# Token count per file; files missing from the corpus count as 0 tokens.
# (The original fallback was `0 in filename`, which raises TypeError for a
# string filename instead of yielding 0.)
metadata["no_tokens"] = [
    len(latinise.words(filename)) if filename in known_fileids else 0
    for filename in metadata.index.tolist()
]
In [6]:
# cut the corpus
# Keep only the documents that actually contain tokens.
metadata = metadata[metadata["no_tokens"] > 0]

# Try every combination of bin width (`binstep`) and first bin edge
# (`binstart`) and record, for each candidate binning of the "date" column,
# how many files and how many tokens fall into each period.
# NOTE(review): "date" is presumably a year; confirm against the metadata CSV.
bins_test = []
for binstep in range(1, 500, 1):
    for binstart in range(-450, 0, binstep):
        # Bin edges run from binstart up to (but excluding) 850.
        bins = range(binstart, 850, binstep)
        # Rows whose date falls outside the edges get a NaN period and are
        # removed by dropna(); include_lowest makes the first interval
        # closed on its left edge.
        tmp = pd.DataFrame(data={'count': metadata["no_tokens"],
                                 'period': pd.cut(metadata["date"], bins=bins, include_lowest=True)}).dropna()
        # Group once and reuse. observed=False is pinned so that empty periods
        # stay in the result (as 0) regardless of the pandas default, which
        # would otherwise change the variation figures computed later.
        per_period = tmp.reset_index().groupby("period", observed=False)
        bins_test.append([binstart, binstep, len(bins),
                          per_period["filename"].count(),
                          per_period["count"].sum()])
In [10]:
# prepare the df
# One row per candidate binning; "files" and "tokens" each hold a per-period
# Series, "length" holds the number of bin edges.
bin_columns = ["start", "step", "length", "files", "tokens"]
bins_df = pd.DataFrame(bins_test)
bins_df.columns = bin_columns

# Coefficient of variation across periods: first for the file counts,
# then for the token counts.
for per_period_col, variation_col in (("files", "var_files"), ("tokens", "var_tokens")):
    bins_df[variation_col] = bins_df[per_period_col].map(variation)

# Show the binnings that start at -450 and have more than 3 bin edges,
# most even token distribution first.
starts_at_minus_450 = bins_df["start"] == -450
more_than_three_edges = bins_df["length"] > 3
bins_df[starts_at_minus_450 & more_than_three_edges].sort_values("var_tokens", ascending=True).head()
Out[10]:
start step length files tokens var_files var_tokens
2826 -450 233 6 period (-450.001, -217.0] 2 (-217.0, 16.0]... period (-450.001, -217.0] 803 (-217.0, ... 0.586747 0.564911
2824 -450 232 6 period (-450.001, -218.0] 2 (-218.0, 14.0]... period (-450.001, -218.0] 803 (-218.0, ... 0.586747 0.564911
2822 -450 231 6 period (-450.001, -219.0] 2 (-219.0, 12.0]... period (-450.001, -219.0] 803 (-219.0, ... 0.587592 0.565472
2812 -450 226 6 period (-450.001, -224.0] 2 (-224.0, 2.0] ... period (-450.001, -224.0] 803 (-224.0, ... 0.609080 0.567348
2814 -450 227 6 period (-450.001, -223.0] 2 (-223.0, 4.0] ... period (-450.001, -223.0] 803 (-223.0, ... 0.609080 0.567348
In [11]:
# plot the results
# Restrict to the binnings whose first edge is -450 and draw both variation
# measures against the bin width.
binnings_from_minus_450 = bins_df[bins_df["start"] == -450]
fig = px.line(
    binnings_from_minus_450,
    x="step",
    y=["var_tokens", "var_files"],
    hover_data=["tokens", "files"],
)
fig.show()